Recipe

Get the data

Extract the Top 1000 popular vegetarian recipes and their ingredient using Spoonacular API.

# the API key of Spoonacular is stored as environment variable "spoonacular_api_key" and "spoonacular_api_key_ljy"

# store and retrive key to environment 
#Sys.setenv(spoonacular_api_key = "")
#Sys.getenv("spoonacular_api_key")


combined <- data.frame(recipe_id = integer(),
                       recipe_name = character())

for (n in seq(0,900,100)) {
  request <- httr::GET(
    "https://api.spoonacular.com/recipes/complexSearch", 
    query = list(apiKey = Sys.getenv("spoonacular_api_key"),
                 diet = "vegetarian",
                 number = 100,
                 offset = n, # Spoonacular API only allows this to range from 0 to 900
                 sort = "popularity",
                 sortDirection = "desc"
                 )
    )  
  
  response <- jsonlite::fromJSON(httr::content(request, as = "text", encoding = "UTF-8"), flatten = TRUE)[[1]] %>%
  dplyr::select(recipe_id = id, recipe_name = title)
  
  combined <- rbind(combined, response)
  
  print(paste("extracted", n + 100, "recipes"))
}

save(combined, file = "combined_recipe_id.RData")
combined <- combined %>%
  mutate(ingredient = NA)

for (i in 1:nrow(combined)) {
  recipe_id <- combined[i, "recipe_id"]
  request <- httr::GET(
    paste0("https://api.spoonacular.com/recipes/",recipe_id,"/ingredientWidget.json"), 
    query = list(apiKey = Sys.getenv("spoonacular_api_key")))
  
  response <- jsonlite::fromJSON(httr::content(request, as = "text", encoding = "UTF-8"), flatten = TRUE)[[1]] 
  
  ingredient <- response[["name"]]
  
  combined[i, "ingredient"] <- paste(ingredient, collapse = "//")
}

save(combined, file = "combined_recipe_id_ingredient.RData")

Preprocess

library(stringr)
library(SemNetCleaner)

load("combined_recipe_id_ingredient.RData")

allIngredient <- paste(combined$ingredient, collapse = "//")
ingredientList <- strsplit(allIngredient, "//")
ingredientDf <- data.frame(ingredientList, stringsAsFactors = FALSE)
names(ingredientDf) <- "ingredient"

ingredientDf <- ingredientDf %>%
  mutate(ingredient = case_when(ingredient %in% c("bread flour", "all purpose flour","gluten-free gluten free all purpose baking flour","whole wheat white flour","canned all purpose flour","plain flour","AP flour","plain all purpose flour","gluten free flour","gluten-free flour","unbleached flour","unbleached all purpose flour","white bread flour") ~ "flour",
                                ingredient %in% c("whole wheat flour","white whole wheat flour","white whole wheat pastry flour","whole-wheat pastry flour") ~ "whole wheat flour",
                                ingredient == "ground cinnamon" ~ "cinnamon",
                                ingredient == "baking powder" ~ "baking soda",
                                ingredient == "garlic cloves" ~ "garlic",
                                ingredient %in% c("fresh lemon juice", "lemon (juice)") ~ "lemon juice",
                                ingredient == "egg yolks" ~ "egg yolk",
                                ingredient == "egg whites" ~ "egg white",
                                ingredient %in% c("unsalted butter", "salted butter", "unsalted salted butter") ~ "butter",
                                ingredient == "half n half cream" ~ "half & half cream",
                                ingredient %in% c("red onion", "vidalia onion", "yellow onion", "white onion", "sweet onion", "red white yellow onion", "red diced onion", "white yellow onion", "yellow onions", "red onions", "sweet onions", "onions") ~ "onion",
                                ingredient %in% c("green onions", "spring onions", "scallions", "green white scallions", "scallion", "green white light scallions") ~ "green onion",
                                ingredient == "apples" ~ "apple",
                                ingredient == "basil leaves" ~ "basil",
                                ingredient == "bay leaves" ~ "bay leaf",
                                ingredient %in% c("barbecue sauce", "Barbeque Dipping Sauce") ~ "bbq sauce",
                                TRUE ~ ingredient))

ingredientDf[str_detect(ingredientDf$ingredient, "\\b(apple|apples)$"),] <- "apple" # because the singularize() function has trouble dealing with "apples", so we have to move this one up to here

ingredientDf <- ingredientDf %>%
  mutate(ingredient = str_replace(ingredient, "^(dried |diced |fresh |frozen |whole )+(.*)$", "\\2")) # remove leading dried, diced ...

ingredientDf$ingredient <- unlist(map(ingredientDf$ingredient, singularize)) # plural to single, takes a few minutes to run

# combine some too detailed ingredient
ingredientDf[str_detect(ingredientDf$ingredient, "sugar"),]  <- "sugar"
ingredientDf[str_detect(ingredientDf$ingredient, "\\bsalt\\b"),] <- "salt"
ingredientDf[str_detect(ingredientDf$ingredient, "flour tortillas"),] <- "flour tortillas"
ingredientDf[str_detect(ingredientDf$ingredient, "coconut milk"),] <- "coconut milk"
ingredientDf[str_detect(ingredientDf$ingredient, "peanut butter"),] <- "peanut butter"
ingredientDf[str_detect(ingredientDf$ingredient, "oil$"),] <- "oil"
ingredientDf[str_detect(ingredientDf$ingredient, "cheese$"),] <- "cheese"
ingredientDf[str_detect(ingredientDf$ingredient, "sour cream$"),] <- "sour cream"

ingredientDfCleaned <- ingredientDf %>%
  group_by(ingredient) %>%
  summarise(cnt = n()) %>%
  ungroup() %>%
  arrange(desc(cnt))

save(ingredientDfCleaned, file = "ingredientDfCleaned.RData")
# frequency word in recipe title

library(tm) 

load("combined_recipe_id_ingredient.RData")

myStopWords <- c("quick", "best", "black", "homemade")

allTitle <- paste(combined$recipe_name, collapse = " ") %>%
  removeNumbers() %>%
  removePunctuation() %>%
  tolower() %>%
  removeWords(c(stopwords("english"), myStopWords)) %>%
  stripWhitespace()

titleList <- strsplit(allTitle, " ")
titleDf <- data.frame(titleList, stringsAsFactors = FALSE) 
names(titleDf) <- "word"
titleDf <- titleDf %>%
  group_by(word) %>%
  tally() %>%
  ungroup()

save(titleDf, file="titleWordCount.RData")

Visualization

Wordcloud

If all ingredients are included …

library(wordcloud)

wordcloudDf <- ingredientDfCleaned 

set.seed(1997)
wordcloud(wordcloudDf$ingredient, wordcloudDf$cnt, scale = c(2,.5), min.freq = 0,
          max.words = 100, 
          colors = "#41ab5d")

Exclude ingredients that are too rare (appeared less than 5 times) or too common (more than 100 times).

wordcloudDf <- ingredientDfCleaned %>%
  filter(cnt > 10 & cnt < 100) 

set.seed(1997)
wordcloud(wordcloudDf$ingredient, wordcloudDf$cnt, scale = c(1.5,.25), min.freq = 0,
          max.words = 100, 
          colors = brewer.pal(max(7, ncol(wordcloudDf$cnt)),"Greens"))

Play around with the title of the recipes…

load("titleWordCount.RData")

titleDf <- titleDf %>%
  filter(n > 10) 

set.seed(1997)
wordcloud(titleDf$word, titleDf$n, scale = c(1.5,.25), min.freq = 0,
          max.words = 100, 
          colors = brewer.pal(max(7, ncol(titleDf$n)),"Greens"))

Bar

Although bar plot is more informative than wordcloud, I found they look overly simple. Any suggestion about how to make them cooler?

static

library(ggplot2)
library(ggthemes)

top20Ingredient <- ingredientDfCleaned %>%
  filter(cnt > 10 & cnt < 100) %>%
  top_n(n = 20, wt = cnt) %>%
  mutate(ingredient = fct_reorder(ingredient, cnt))

ggplot(top20Ingredient) + 
  geom_bar(aes(x = ingredient, y = cnt),
           stat = "identity", position = "identity", fill = "#639a67", alpha = 0.8) +  
  geom_text(aes(x = ingredient, y = cnt, label = cnt), 
            color = "white", fontface = "bold", size = 3, hjust = 1.5) + 
  coord_flip() + 
  
  labs(
    title = "Need a Little Greenness",
    subtitle = "Number of times each ingredient appears in the top 1000 recipes.",
    caption = "
    Note: Ingredients appear in less than 10 recipes or more than 300 recipes are excluded.
    Data Source: https://spoonacular.com/food-api/
    "
  ) + 
  
  theme_hc() +
  
  theme(
    panel.grid.major.y = element_blank(),
    axis.ticks = element_blank(),
    axis.text.x = element_blank(), 
    axis.text.y = element_text(color = "black", size = 10),
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    
    legend.position = "none",
    
    plot.caption=element_text(size=7, color="grey40"),
    plot.subtitle = element_text(size=10, color="grey60", face="italic"),
  )

interactive

library(plotly)
library(forcats)

top20Ingredient <- ingredientDfCleaned %>%
  filter(cnt > 10 & cnt < 100) %>%
  top_n(n = 20, wt = cnt) %>%
  mutate(ingredient = fct_reorder(ingredient, cnt))


plot_ly(top20Ingredient,
        x = ~cnt, y = ~ingredient, name = "",
        type = "bar", orientation = "h",
        hovertemplate = paste('%{y}', '<br>%{x} recipes<br>'),
        marker = list(
          color = "#639a67",
          line = list(width = 0)
        )) %>%
  layout(title = "Popular Ingredients in Vegetarian Recipes",
         xaxis = list(title = "# of recipes with the ingredient"),
         yaxis = list(title = ""),
         annotations = list(x = 0.21, y = -0.115, 
                            text = "Note: Ingredients appear in less than 10 recipes or \nmore than 300 recipes are excluded.",
                            showarrow = F, xref='paper', yref='paper', 
                            xanchor='right', yanchor='auto', xshift=0, yshift=0,align="left",
                            font=list(size=10, color="grey")))